# Installing / loading libraries
# Attach every package used below, installing any that are missing first.
# Fix: the second entry used to test for "lubridate" but then installed and
# attached "readtext". lubridate is the package that is actually needed
# (dmy() is called when the scraped feedback tables are parsed), so it is now
# installed and attached under its own name.
invisible(lapply(
  c("quanteda", "lubridate", "tidyverse", "pdftools",
    "haven", "parameters", "performance", "see"),
  function(pkg) {
    # requireNamespace() only checks availability; library() then attaches the
    # package and fails loudly if the installation did not succeed.
    if (!requireNamespace(pkg, quietly = TRUE)) {
      install.packages(pkg)
    }
    library(pkg, character.only = TRUE)
  }
))

# Use the light ggplot2 theme for every plot produced below.
theme_set(theme_light())
## here() starts at C:/Users/batzdova/Desktop/EC-Web-Scrapping-and-Text-Mining
## [1] 435
#(II)
# Unused alternatives for reading the PDFs, kept for reference:
#result <- lapply(file.list, FUN = function(files) {
#  pdf_text(files)
#})
#result <- lapply(file.list, pdftools::pdf_text)

# Make a table with 2 columns: the doc name & the pdf content.
# The text returned by pdf_text() for each PDF is combined into one string
# with paste0().
# NOTE(review): `file.list` is not defined in the visible code; it is assumed
# to be a character vector of PDF paths -- confirm.
filestextDF <- data.frame(
  Document = file.list,
  # vapply() guarantees exactly one string per file; sapply() can silently
  # change its return type (e.g. for an empty file list).
  text = vapply(
    file.list,
    function(path) paste0(pdf_text(path), collapse = " "),
    character(1)
  )
)
tb_pdf <- as_tibble(filestextDF)

# Extract ids (the first 7 characters of the file name) for the docs, to match
# them with the metadata later.
# N = 435 docs
# Problem: multiple docs for same id (= multiple docs by same submitter)
# Fix: the directory used to be stripped by removing a hard-coded absolute
# path of one machine, which leaves the path (and hence the id) wrong on any
# other machine; basename() drops the directory part wherever the files live.
tb_pdf$Document <- basename(tb_pdf$Document)
ids <- substr(tb_pdf$Document, 1, 7)
tb_pdf$id <- ids

# Fix: this call was fused onto the end of the assignment above, so the script
# tried to call a non-existent function `idslibrary`.
library(readr)
# Read the semicolon-delimited survey export of the 2020 public consultation.
Public_consultation_2020 <- read_delim(
  "./Data/Public_consultation_2020/files/Public_consultation_2020.csv",
  delim = ";", escape_double = FALSE, trim_ws = TRUE
)
consult_meta <- as_tibble(Public_consultation_2020)

# Attach the PDF text to the survey rows via the submission reference.
# Fix: this statement was fused onto the end of the previous line, which made
# the script unparseable.
temp <- left_join(consult_meta, tb_pdf, by = c("Reference" = "id")) %>%
  as_tibble()

# Observations without pdf text
temp %>% filter(is.na(text))

# Recoding the survey (First submission round)
# Drop variables
# var column nr. 73: temp[,73]
temp <- temp %>%
  dplyr::select(!`You can upload a document here:\n\n`) %>%
  dplyr::select(!`Publication privacy settings`)

# Renaming variables.
# The metadata columns are renamed by their exact names; the survey questions
# are renamed by matching a distinctive fragment of the question text.
temp <- temp %>%
  rename(
    filename = Document,
    country = Country,
    org = `Organisation name`,
    id = Reference,
    time = `Feedback date`,
    lang = Language,
    type = `User type`,
    firstname = `First name`,
    surname = Surname,
    scope = Scope,
    register = `Transparency register number`,
    size = `Organisation size`
  ) %>%
  rename_with(~ 'coop_member_states', matches('Working with Member states')) %>%
  rename_with(~ 'research_innov', matches('Focussing the efforts of the research and innovation community')) %>%
  rename_with(~ 'skills', matches('\n: Skills')) %>%
  rename_with(~ 'SME', matches('\n: Focus on SMEs')) %>%
  rename_with(~ 'private_sector', matches('\n: Partnership with the private sector')) %>%
  rename_with(~ 'public_sector', matches('\n: Promoting the adoption of AI by the public sector')) %>%
  rename_with(~ 'other_action', matches('other actions that should be considered?')) %>%
  rename_with(~ 'excel_research', matches('\n: Strengthen excellence in research')) %>%
  rename_with(~ 'testing_fac', matches('Establish world-reference testing facilities for AI')) %>%
  rename_with(~ 'uptake_ai', matches('Promote the uptake of AI by business and the public sector')) %>%
  rename_with(~ 'startup_finance', matches('Increase the financing for start-ups innovating in AI')) %>%
  rename_with(~ 'training_skills', matches('Develop skills for AI and adapt existing training programmes')) %>%
  rename_with(~ 'eu_data_space', matches('Build up the European data space')) %>%
  rename_with(~ 'other_area', matches('Are there other areas that that should be considered')) %>%
  rename_with(~ 'lighthouse', matches('Support the establishment of a lighthouse research centre that is world class and able to attract the best minds')) %>%
  rename_with(~ 'net_centres', matches('Network of existing AI research excellence centres')) %>%
  rename_with(~ 'partner_research', matches('Set up a public-private partnership for industrial research')) %>%
  rename_with(~ 'action_research', matches('actions to strengthen the research and innovation community that should be given a priority')) %>%
  rename_with(~ 'benefits_ai', matches('Help to raise SME’s awareness about potential benefits of AI')) %>%
  rename_with(~ 'access_testing', matches('Provide access to testing and reference facilities')) %>%
  rename_with(~ 'knowhow_transfer', matches('Promote knowledge transfer and support the development of AI expertise for SMEs')) %>%
  rename_with(~ 'partner_aiproject', matches('Support partnerships between SMEs, larger enterprises and academia around AI projects')) %>%
  rename_with(~ 'equity_finance', matches('Provide information about equity financing for AI startups')) %>%
  rename_with(~ 'tasks_innovhub', matches('important for specialised Digital Innovations Hubs')) %>%
  rename_with(~ 'concern_safety', matches('AI may endanger safety')) %>%
  rename_with(~ 'concern_rights', matches('AI may breach fundamental rights')) %>%
  # Fix: a second, identical 'concern_safety' rename stood here. After the
  # first one the pattern matches no column any more, so it was at best a
  # no-op; recent dplyr versions are expected to reject a rename function that
  # returns one name for zero selected columns.
  rename_with(~ 'concern_discrim', matches('The use of AI may lead to discriminatory outcomes')) %>%
  rename_with(~ 'concern_explain', matches('AI may take actions for which the rationale cannot be explained')) %>%
  rename_with(~ 'concern_compensat', matches('AI may make it more difficult for persons having suffered harm to obtain compensation')) %>%
  rename_with(~ 'concern_accuracy', matches('AI is not always accurate')) %>%
  rename_with(~ 'concern_other', matches('Do you have any other concerns about AI that are not mentioned')) %>%
  rename_with(~ 'leg_rules', matches('Do you think that the concerns expressed above can be addressed by applicable EU legislation'))

# The remaining survey columns are renamed by position (columns 44 to 71).
# Fix: the first of these assignments was fused onto the end of the pipe
# above, which made the script unparseable.
# NOTE(review): the original assigned "reform_assess" to column 65 and then
# immediately overwrote it with "risk_procedure". Only the effective result is
# kept here; check whether the indices from 65 onwards are off by one.
names(temp)[44:71] <- c(
  "rules_other",                # 44
  "rules_highrisk",             # 45
  "mitigate_other",             # 46
  "highrisk_approach",          # 47
  "highrisk_other",             # 48
  "highrisk_app",               # 49
  "requir_qual_training_data",  # 50
  "requir_record_data",         # 51
  "requir_purpose",             # 52
  "requir_robust_acc",          # 53
  "requir_human_oversight",     # 54
  "requir_liability",           # 55
  "requir_biometric",           # 56
  "requir_spec",                # 57
  "label_aisystem",             # 58
  "label_suggest",              # 59
  "trust_spec",                 # 60
  "trust_enforce",              # 61
  "compliance_spec",            # 62
  "risk_spec",                  # 63
  "risk_reform",                # 64
  "risk_procedure",             # 65
  "risk_other",                 # 66
  "liability_reform",           # 67
  "liabilty_further",           # 68
  "liability_national",         # 69
  "liabilty_app",               # 70
  "liabilty_other"              # 71
)
#cooperation member states (Likert scale 1-5 (not important - very important))
# Recode a vector of importance answers to numbers.
#   x: vector of answer labels.
# Returns a double vector as long as x: 5 ("5 - Very important") down to
# 1 ("1 - Not important at all"), 0 for "No opinion", and NA for missing
# answers or any label not listed here.
recode_likert <- function(x) {
  labels <- c(
    "5 - Very important", "4 - Important", "3 - Neutral",
    "2 - Not important", "1 - Not important at all", "No opinion"
  )
  codes <- c(5, 4, 3, 2, 1, 0)
  # match() yields NA for anything that is not one of the labels.
  codes[match(x, labels)]
}

# Minimum, maximum, mean and standard deviation of a recoded item, rounded to
# two digits.
#   x: numeric vector (a recoded Likert item).
# Fix: sd() used na.rm = FALSE and therefore returned NA as soon as one answer
# was missing, while the mean reported by summary() ignores missing answers.
describe_likert <- function(x) {
  c(summary(x)[c("Min.", "Max.", "Mean")], "sd" = sd(x, na.rm = TRUE)) %>%
    round(digits = 2)
}

# All survey items answered on the importance scale. Each of them used to be
# recoded by its own copy of the same case_when() block.
likert_items <- c(
  "coop_member_states",         # cooperation member states
  "research_innov",             # research innovation focus
  "skills",                     # skills
  "SME",                        # focus on SMEs
  "private_sector",             # partnership w. private sector
  "public_sector",              # partnership w. public sector
  "excel_research",             # Strengthen excellence in research
  "testing_fac",                # Establish world-reference testing facilities for AI
  "uptake_ai",                  # Promote the uptake of AI by business and the public sector
  "startup_finance",            # Increase the financing for start-ups innovating in AI
  "training_skills",            # Develop skills for AI and adapt existing training programmes
  "eu_data_space",              # Build up the European data space
  "lighthouse",                 # establishment of a lighthouse research centre
  "net_centres",                # Network of existing AI research excellence centres
  "partner_research",           # Set up a public-private partnership for industrial research
  "benefits_ai",                # SMEs awareness about potential benefits of AI
  "access_testing",             # Provide access to testing and reference facilities
  "knowhow_transfer",           # Promote knowledge transfer and support the development of AI expertise for SMEs
  "partner_aiproject",          # partnerships between SMEs, larger enterprises and academia around AI projects
  "equity_finance",             # information about equity financing for AI startups
  "concern_safety",             # AI may endanger safety
  "concern_rights",             # AI may breach fundamental rights (such as human dignity, privacy, data protection)
  "concern_discrim",            # AI may lead to discriminatory outcomes
  "concern_explain",            # actions for which the rationale cannot be explained
  "concern_compensat",          # difficult to obtain compensation
  "concern_accuracy",           # AI is not always accurate
  "requir_qual_training_data",  # quality of training data sets
  "requir_record_data",         # keeping of records and data
  "requir_purpose",             # Info on the purpose and the nature of AI systems
  "requir_robust_acc",          # Robustness and accuracy of AI systems
  "requir_liability",           # Clear liability and safety rules
  "requir_human_oversight"      # Human oversight
)

# NOTE(review): "No opinion" is coded as 0 and therefore enters the means and
# histograms below as if it were a point on the scale -- confirm that this is
# intended.
temp <- temp %>%
  mutate(across(all_of(likert_items), recode_likert))

# Descriptives for selected items. The "##" lines are the output recorded
# before the sd() fix, which is why they still show sd = NA.
describe_likert(temp$coop_member_states)
## Min. Max. Mean sd
## 0.00 5.00 4.28 NA
hist(temp$coop_member_states, breaks = 60)

describe_likert(temp$startup_finance)
## Min. Max. Mean sd
## 0.00 5.00 3.72 NA
hist(temp$startup_finance, breaks = 60)

describe_likert(temp$requir_human_oversight)
## Min. Max. Mean sd
## 0.00 5.00 4.39 NA
hist(temp$requir_human_oversight, breaks = 60)

# Combine first name and surname into a single `person` column.
# Fix: this statement was fused onto the hist() call above, which made the
# script unparseable.
tidy_df1 <- temp %>% unite("person", firstname:surname, sep = " ")
# Add a column marking the first consultation round.
# Fix: the three statements of this section were fused onto each other (and
# onto a prose heading), which made the script unparseable.
tidy_df1 <- tidy_df1 %>%
  mutate(consult_round = "one")

# Rewrite the user-type labels (mostly capitalisation). Labels not listed
# here are left unchanged by recode().
tidy_df1 <- tidy_df1 %>%
  mutate(type = recode(type, # old value = new value
    `NGO (Non-governmental organisation)` = "Non-governmental organisation (NGO)",
    `Academic/Research Institution` = "Academic/research Institution",
    `EU Citizen` = "EU citizen",
    `Company/Business organisation` = "Company/business organisation",
    `Consumer Organisation` = "Consumer organisation",
    `Trade Union` = "Trade union",
    `Business Association` = "Business association"
  ))

# Rewrite the organisation-size labels to explicit employee ranges.
tidy_df1 <- tidy_df1 %>%
  mutate(size = recode(size, # old value = new value
    `Medium (< 250 employees)` = "Medium (50 to 249 employees)",
    `Small (< 50 employees)` = "Small (10 to 49 employees)",
    `Micro (< 10 employees)` = "Micro (1 to 9 employees)"
  ))

# roadmap_2020 and final round commission_adoption_2021
library(readr)
# Load the commission_adoption_2021 table; the "##" lines are the console
# output recorded when the file was read.
commission_adoption_2021 <- read_csv(
  file = "./Augmented_data/commission_adoption_2021.csv"
)
## Rows: 304 Columns: 11
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (11): Feedback reference, Submitted on, Submitted by, User type, Organis...
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Load the roadmap_2020 table in the same way.
roadmap_2020 <- read_csv(
  file = "./Augmented_data/roadmap_2020.csv"
)
## Rows: 123 Columns: 11
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (11): Feedback reference, Submitted on, Submitted by, User type, Organis...
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Alternative for the renaming below: library(janitor) roadmap %>% clean_names()
# Tidy the roadmap_2020 table: short column names, parsed dates, and German
# labels translated to the English labels used for the other rounds.
scrap20 <- roadmap_2020 %>%
  rename(
    country = `Country of origin`,
    id = `Feedback reference`,
    time = `Submitted on`,
    person = `Submitted by`,
    type = `User type`,
    org = Organisation,
    size = `Organisation size`,
    register = `Transparency register number`,
    initiative = Initiative,
    abstract = Paragraph,
    text = pdf
  ) %>%
  mutate(time = dmy(time)) %>%
  mutate(type = recode(type, # old value = new value
    `NRO (Nichtregierungsorganisation)` = "Non-governmental organisation (NGO)",
    `Universität/Forschungseinrichtung` = "Academic/research Institution",
    `EU-Bürger/-in` = "EU citizen",
    `Sonstiges` = "Other",
    `Unternehmen/Unternehmensverband` = "Company/business organisation",
    `Verbraucherverband` = "Consumer organisation",
    `Behörde` = "Public authority",
    `Gewerkschaft` = "Trade union",
    `Wirtschaftsverband` = "Business association",
    `-` = "Missing"
  )) %>%
  # Organisation size. The original author noted that the exact-match recode()
  # of the full German labels did not work and added pattern-based fallbacks.
  # Fix: those fallbacks tested "klein" before "sehr", so every
  # "sehr klein ..." value was turned into "Small ..." and the "Micro"
  # category could never be assigned; "groß ..." had no fallback at all and
  # stayed untranslated. The patterns are now tested from most to least
  # specific in one case_when(); values matching none of them are kept as-is.
  mutate(size = case_when(
    str_detect(size, "sehr klein") ~ "Micro (1 to 9 employees)",
    str_detect(size, "klein") ~ "Small (10 to 49 employees)",
    str_detect(size, "mittel") ~ "Medium (50 to 249 employees)",
    str_detect(size, "^gro") ~ "Large (250 or more)",
    size == "-" ~ "Missing",
    TRUE ~ size
  )) %>%
  mutate(country = recode(country,
    `Vereinigten Staaten` = "United States",
    `Belgien` = "Belgium",
    `Slowakei` = "Slovakia",
    `Italien` = "Italy",
    `Niederlande` = "Netherlands",
    `Dänemark` = "Denmark",
    `Vereinigtes Königreich` = "United Kingdom",
    `Frankreich` = "France",
    `-` = "Missing",
    `international` = "Other",
    `Spanien` = "Spain",
    `Österreich` = "Austria",
    `Schweden` = "Sweden",
    `Polen` = "Poland",
    `Irland` = "Ireland",
    `Finnland` = "Finland",
    `Deutschland` = "Germany",
    `Ungarn` = "Hungary",
    `Tschechien` = "Czech Republic",
    `Rumänien` = "Romania",
    `Bulgarien` = "Bulgaria"
  ))
# Tidy the commission_adoption_2021 table: short column names, parsed dates,
# and stray values folded into "Other" / "Missing".
scrap21 <- commission_adoption_2021 %>%
  rename(
    country = `Country of origin`,
    id = `Feedback reference`,
    time = `Submitted on`,
    person = `Submitted by`,
    type = `User type`,
    org = Organisation,
    size = `Organisation size`,
    register = `Transparency register number`,
    initiative = Initiative,
    abstract = Paragraph,
    text = pdf
  ) %>%
  mutate(
    time = dmy(time),
    # NOTE(review): the two personal names below look like values that ended
    # up in the wrong column of the source file -- confirm.
    type = recode(
      type,
      "Ukyo Mori" = "Other",
      "Johannes Kröhnert" = "Other",
      "-" = "Missing"
    ),
    country = recode(
      country,
      "Regional" = "Other",
      "Local" = "Other",
      "feedback.usertype.company" = "Other",
      "feedback.usertype.business_association" = "Other",
      "National" = "Other"
    ),
    size = recode(size, "-" = "Missing")
  )
# Mark the consultation round of each scraped table.
scrap20 <- scrap20 %>% mutate(consult_round = "two")
scrap21 <- scrap21 %>% mutate(consult_round = "three")

# Problem with scrap20 and the ids F550611 and F550610: they are doubles
# (with empty abstract and text section); the complete entry is F550619.
# scrap20 has 123 rows but should have 133!
# After filtering: 121 rows
scrap20 <- scrap20 %>%
  filter(id != "F550611", id != "F550610")

# Problem 2: missing on all variables
# Fix: the following statements were fused onto one line (and the row-bind
# below onto a prose heading), which made the script unparseable.
scrap20 %>% filter(is.na(abstract))
scrap20 %>% filter(id == "-")
scrap20 <- scrap20 %>% filter(id != "-")

# There are n = 85 pdfs in the folder but only n = 69 [text] in the csv.
# There are n = 49 entries with only an abstract but no text:
# filter(!is.na(abstract), is.na(text))
scrap20 %>% filter(!is.na(text))

# Stack rounds two and three.
submission <- rbind(scrap20, scrap21)

# The data sets to be merged next have different cell and column numbers;
# they share: id, time, person, type, org, size, register, country, text
# Parse the round-one timestamps (day.month.year) as dates.
# Fix: all statements of this section were fused onto one line, which made
# the script unparseable; they are separated again here.
tidy_df1$time <- as.Date(tidy_df1$time, "%d.%m.%Y")

# Put the shared columns in one fixed order directly after `time`
# (same result as relocating them one after the other).
tidy_df1 <- tidy_df1 %>%
  relocate(person, type, org, size, register, country, text, .after = time)
submission <- submission %>% relocate(text, .after = country)

### Merge all 3 data frames together
three_submission <- full_join(
  tidy_df1, submission,
  by = c(
    "id", "time", "type", "size", "org", "register", "text",
    "consult_round", "person", "country"
  )
)

# size: code the organisation size numerically; NA and any label not listed
# here become NA.
three_submission <- three_submission %>%
  mutate(size = case_when(
    size == "Large (250 or more)" ~ 4,
    size == "Medium (50 to 249 employees)" ~ 3,
    size == "Small (10 to 49 employees)" ~ 2,
    size == "Micro (1 to 9 employees)" ~ 1,
    size == "Missing" ~ 0
  ))

library(pdftools) # to read in pdfs
library(tidytext) # to tokenize text, remove stop words, and calculate tfidf## Warning: Paket 'tidytext' wurde unter R Version 4.1.3 erstellt
library(tidyverse) # to wrangle data, count words, and plot data
library(textclean) # to clean up text a bit, removing non-ascii chars etc.
# Clean the submission texts step by step; each step works on the result of
# the previous one.
consult_text_clean <- three_submission %>%
  mutate(
    text = str_trim(text),                       # trim leading and trailing white space
    text = replace_url(text),                    # remove URLs from text
    text = replace_non_ascii(text),              # remove non-ascii characters
    text = replace_symbol(text),                 # replace $ and other characters with word replacements
    text = str_remove_all(text, "[0-9]+"),       # remove numbers
    text = str_remove_all(text, "[[:punct:]]+"), # remove punctuation
    text = str_replace(text, "Ref Ares", ""),    # drop the first "Ref Ares" in each text
    text = str_squish(text)                      # remove extra white space (e.g., line breaks)
  )

# Fix: this call had been fused into the trailing comment of the statement
# above, so the .rds file was never written.
saveRDS(consult_text_clean, "consult_text_clean.rds")
write_csv(consult_text_clean, "consult_text_clean.csv")

#custom_stop_words <- tibble(word = c("canada", "canadas", "report", "cent", "gouvqcca", "crimi", "nal")) #words that may appear frequently on certain pages, but that we don't want to keep.
# Split the cleaned texts into lower-case word tokens, drop English stop
# words, and count how often each word occurs per submission id. The counts
# are needed to calculate tf-idf in the next step.
consult_tokens <- consult_text_clean %>%
  unnest_tokens(word, text, token = "words", to_lower = TRUE) %>%
  # Remove English stop words (e.g., I, a, the). The join key is now spelled
  # out instead of being guessed, which also silences the
  # 'Joining, by = "word"' message.
  anti_join(stop_words, by = "word") %>%
  # anti_join(custom_stop_words) %>% # remove our custom stop words
  # filter(n > 50) %>% # keep only pages with more than 100 words (NOTE: the code says 50)
  count(id, word)
# Deal with missing values
library(naniar)
## Warning: Paket 'naniar' wurde unter R Version 4.1.3 erstellt

# Number of rows whose cleaned text is missing.
consult_text_clean$text %>%
  is.na() %>%
  sum()
## [1] 895

# Plot the amount of missing data per variable.
three_submission %>%
  gg_miss_var()
## Warning: The `guide` argument in `scale_*()` cannot be `FALSE`. This was deprecated in
## ggplot2 3.3.4.
## i Please use "none" instead.
## i The deprecated feature was likely used in the naniar package.
## Please report the issue at <https://github.com/njtierney/naniar/issues>.
library(janitor)## Warning: Paket 'janitor' wurde unter R Version 4.1.3 erstellt
##
## Attache Paket: 'janitor'
## Die folgenden Objekte sind maskiert von 'package:stats':
##
## chisq.test, fisher.test
# Frequency table of submissions by country, with a totals row and the
# shares formatted as percentages.
consult_text_clean %>%
  tabyl(country) %>%
  adorn_totals("row") %>%
  adorn_pct_formatting() %>%
  knitr::kable()
# Fix: the rendered table was fused onto the kable() call and continued as
# bare text, which made the script unparseable. It is kept below as recorded
# output.
## | country | n | percent | valid_percent |
## |---|---|---|---|
## | Albania | 1 | 0.1% | 0.1% |
## | Austria | 28 | 1.7% | 2.3% |
## | Belgium | 265 | 16.0% | 22.2% |
## | Brazil | 1 | 0.1% | 0.1% |
## | Bulgaria | 7 | 0.4% | 0.6% |
## | Canada | 4 | 0.2% | 0.3% |
## | China | 2 | 0.1% | 0.2% |
## | Costa Rica | 1 | 0.1% | 0.1% |
## | Côte d’Ivoire | 1 | 0.1% | 0.1% |
## | Croatia | 2 | 0.1% | 0.2% |
## | Cyprus | 1 | 0.1% | 0.1% |
## | Czech Republic | 8 | 0.5% | 0.7% |
## | Denmark | 22 | 1.3% | 1.8% |
## | Finland | 24 | 1.5% | 2.0% |
## | France | 108 | 6.5% | 9.1% |
## | Germany | 187 | 11.3% | 15.7% |
## | Greece | 7 | 0.4% | 0.6% |
## | Hungary | 4 | 0.2% | 0.3% |
## | India | 5 | 0.3% | 0.4% |
## | Iraq | 1 | 0.1% | 0.1% |
## | Ireland | 16 | 1.0% | 1.3% |
## | Italy | 46 | 2.8% | 3.9% |
## | Japan | 8 | 0.5% | 0.7% |
## | Latvia | 1 | 0.1% | 0.1% |
## | Lithuania | 2 | 0.1% | 0.2% |
## | Luxembourg | 3 | 0.2% | 0.3% |
## | Malta | 3 | 0.2% | 0.3% |
## | Missing | 11 | 0.7% | 0.9% |
## | Netherlands | 64 | 3.9% | 5.4% |
## | Norway | 5 | 0.3% | 0.4% |
## | Other | 11 | 0.7% | 0.9% |
## | Poland | 23 | 1.4% | 1.9% |
## | Portugal | 18 | 1.1% | 1.5% |
## | Romania | 14 | 0.8% | 1.2% |
## | Slovakia | 5 | 0.3% | 0.4% |
## | South Korea | 1 | 0.1% | 0.1% |
## | Spain | 70 | 4.2% | 5.9% |
## | Sweden | 29 | 1.8% | 2.4% |
## | Switzerland | 16 | 1.0% | 1.3% |
## | Turkey | 1 | 0.1% | 0.1% |
## | United Kingdom | 78 | 4.7% | 6.5% |
## | United States | 88 | 5.3% | 7.4% |
## | NA | 462 | 27.9% | - |
## | Total | 1654 | 100.0% | 100.0% |